# Load library
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Import the training set (includes the `price` response column)
train <- readr::read_csv(file = "train.csv")
## Rows: 1809 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): brand, fuel_type, ext_col, int_col, accident, transmission_type
## dbl (7): id, price, model_year, mileage, cylinders, horsepower, liters
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
test <- readr::read_csv(file = "test.csv")
## Rows: 1000 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): brand, fuel_type, ext_col, int_col, accident, transmission_type
## dbl (6): id, model_year, mileage, cylinders, horsepower, liters
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Column-by-column overview of the training data, then of the test data
summary(object = train)
##        id           price           brand             model_year  
##  Min.   :   1   Min.   : 7.601   Length:1809        Min.   :1996  
##  1st Qu.: 732   1st Qu.: 9.616   Class :character   1st Qu.:2011  
##  Median :1460   Median :10.185   Mode  :character   Median :2015  
##  Mean   :1428   Mean   :10.139                      Mean   :2014  
##  3rd Qu.:2134   3rd Qu.:10.703                      3rd Qu.:2019  
##  Max.   :2809   Max.   :14.899                      Max.   :2024  
##     mileage        fuel_type           ext_col            int_col         
##  Min.   :   105   Length:1809        Length:1809        Length:1809       
##  1st Qu.: 32376   Class :character   Class :character   Class :character  
##  Median : 67035   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 75466                                                           
##  3rd Qu.:107500                                                           
##  Max.   :405000                                                           
##    accident           cylinders       horsepower        liters     
##  Length:1809        Min.   :4.000   Min.   : 76.0   Min.   :1.400  
##  Class :character   1st Qu.:6.000   1st Qu.:247.0   1st Qu.:2.700  
##  Mode  :character   Median :6.000   Median :306.0   Median :3.500  
##                     Mean   :6.201   Mean   :322.3   Mean   :3.725  
##                     3rd Qu.:8.000   3rd Qu.:395.0   3rd Qu.:4.700  
##                     Max.   :8.000   Max.   :760.0   Max.   :7.400  
##  transmission_type 
##  Length:1809       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
summary(object = test)
##        id            brand             model_year      mileage      
##  Min.   :   6.0   Length:1000        Min.   :1996   Min.   :   100  
##  1st Qu.: 668.5   Class :character   1st Qu.:2011   1st Qu.: 32648  
##  Median :1312.5   Mode  :character   Median :2016   Median : 61546  
##  Mean   :1363.5                      Mean   :2015   Mean   : 70664  
##  3rd Qu.:2044.5                      3rd Qu.:2019   3rd Qu.: 98900  
##  Max.   :2807.0                      Max.   :2024   Max.   :315000  
##   fuel_type           ext_col            int_col            accident        
##  Length:1000        Length:1000        Length:1000        Length:1000       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    cylinders       horsepower        liters    transmission_type 
##  Min.   :4.000   Min.   :118.0   Min.   :1.4   Length:1000       
##  1st Qu.:4.000   1st Qu.:248.0   1st Qu.:2.5   Class :character  
##  Median :6.000   Median :310.5   Median :3.5   Mode  :character  
##  Mean   :6.168   Mean   :325.9   Mean   :3.7                     
##  3rd Qu.:8.000   3rd Qu.:395.0   3rd Qu.:4.7                     
##  Max.   :8.000   Max.   :797.0   Max.   :8.1

Basic visualization to start:

# Load libraries
library(ggplot2)
library(dplyr)

# 1. Price distribution per brand, brands ordered by their median price;
#    the axes are flipped so the brand names read horizontally
ggplot(
  data = train,
  mapping = aes(x = reorder(brand, price, FUN = median), y = price)
) +
  geom_boxplot() +
  coord_flip() +
  labs(
    x = "Brand",
    y = "Log Price",
    title = "Log Price Distribution by Brand"
  )

# 2. Price against mileage: semi-transparent points plus a loess trend line
ggplot(data = train, mapping = aes(x = mileage, y = price)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "loess", color = "blue") +
  labs(
    x = "Mileage",
    y = "Log Price",
    title = "Price vs. Mileage"
  )
## `geom_smooth()` using formula = 'y ~ x'

Based on the visualizations above: brand appears to explain a lot of price variation (wide spread in the boxplot). Mileage has a clear negative relationship with price that is only slightly curved. I will begin with a full linear model because it provides a clear, interpretable baseline and most relationships appear roughly linear on the log(price) scale. We can try curved models as we go along.

# Baseline: regress price on every column of `train` except the row `id`
full_model <- lm(
  formula = price ~ . - id,
  data = train
)

# Coefficient table and overall fit statistics
summary(object = full_model)
## 
## Call:
## lm(formula = price ~ . - id, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5078 -0.1837 -0.0026  0.1801  4.6391 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -8.400e+01  4.457e+00 -18.846  < 2e-16 ***
## brandAlfa                    -7.635e-02  1.563e-01  -0.489 0.625216    
## brandAudi                     9.808e-02  7.608e-02   1.289 0.197492    
## brandBMW                      5.984e-02  7.195e-02   0.832 0.405696    
## brandBuick                   -2.986e-01  1.381e-01  -2.163 0.030709 *  
## brandCadillac                 1.836e-02  8.289e-02   0.222 0.824686    
## brandChevrolet                2.877e-02  7.440e-02   0.387 0.699072    
## brandChrysler                -4.140e-01  1.134e-01  -3.650 0.000270 ***
## brandDodge                   -7.169e-02  8.518e-02  -0.842 0.400097    
## brandFord                     1.639e-02  7.177e-02   0.228 0.819358    
## brandGenesis                 -1.427e-01  1.678e-01  -0.850 0.395176    
## brandGMC                      8.482e-02  8.514e-02   0.996 0.319258    
## brandHonda                    2.385e-01  9.442e-02   2.526 0.011629 *  
## brandHummer                   6.041e-01  1.485e-01   4.069 4.93e-05 ***
## brandHyundai                 -2.865e-01  8.789e-02  -3.259 0.001139 ** 
## brandINFINITI                -1.765e-01  8.773e-02  -2.011 0.044435 *  
## brandJaguar                  -8.805e-02  9.508e-02  -0.926 0.354497    
## brandJeep                     2.502e-01  7.994e-02   3.130 0.001779 ** 
## brandKia                     -6.667e-02  9.572e-02  -0.697 0.486181    
## brandLand                     2.040e-01  7.992e-02   2.553 0.010772 *  
## brandLexus                    2.692e-01  7.633e-02   3.527 0.000431 ***
## brandLincoln                 -6.327e-02  9.364e-02  -0.676 0.499352    
## brandMaserati                 2.232e-01  1.152e-01   1.937 0.052913 .  
## brandMazda                    1.184e-01  1.086e-01   1.091 0.275424    
## brandMercedes-Benz            1.644e-01  7.233e-02   2.273 0.023124 *  
## brandMINI                    -1.583e-01  1.111e-01  -1.425 0.154407    
## brandMitsubishi               1.382e-01  1.191e-01   1.160 0.246313    
## brandNissan                  -2.479e-02  8.058e-02  -0.308 0.758364    
## brandOther                    4.175e-01  9.055e-02   4.611 4.29e-06 ***
## brandPontiac                  4.313e-02  1.323e-01   0.326 0.744487    
## brandPorsche                  6.164e-01  7.686e-02   8.019 1.93e-15 ***
## brandRAM                     -1.955e-02  9.984e-02  -0.196 0.844788    
## brandSubaru                   1.695e-01  8.876e-02   1.909 0.056380 .  
## brandToyota                   2.848e-01  7.527e-02   3.784 0.000159 ***
## brandVolkswagen               2.777e-03  9.458e-02   0.029 0.976580    
## brandVolvo                   -1.803e-02  1.157e-01  -0.156 0.876207    
## model_year                    4.661e-02  2.206e-03  21.126  < 2e-16 ***
## mileage                      -5.515e-06  2.062e-07 -26.746  < 2e-16 ***
## fuel_typeE85 Flex Fuel       -5.965e-01  6.533e-02  -9.131  < 2e-16 ***
## fuel_typeGasoline            -6.339e-01  5.451e-02 -11.628  < 2e-16 ***
## fuel_typeHybrid              -5.448e-01  6.539e-02  -8.332  < 2e-16 ***
## ext_colBlack                  2.461e-02  7.162e-02   0.344 0.731125    
## ext_colBlue                   2.889e-02  7.424e-02   0.389 0.697176    
## ext_colBrown                  4.677e-02  9.259e-02   0.505 0.613495    
## ext_colGold                  -7.381e-02  9.903e-02  -0.745 0.456205    
## ext_colGray                   1.539e-02  7.330e-02   0.210 0.833700    
## ext_colGreen                  1.129e-01  8.872e-02   1.272 0.203396    
## ext_colOrange                 3.578e-02  1.031e-01   0.347 0.728583    
## ext_colOther                  2.700e-01  1.683e-01   1.604 0.108799    
## ext_colRed                    8.907e-02  7.515e-02   1.185 0.236048    
## ext_colSilver                 1.358e-02  7.417e-02   0.183 0.854779    
## ext_colWhite                  8.024e-02  7.144e-02   1.123 0.261546    
## ext_colYellow                 7.110e-02  1.159e-01   0.614 0.539577    
## int_colBlack                  2.103e-02  2.502e-02   0.841 0.400635    
## int_colBlue                   1.231e-01  9.542e-02   1.290 0.197288    
## int_colBrown                  6.206e-02  4.292e-02   1.446 0.148376    
## int_colGray                  -7.022e-03  3.125e-02  -0.225 0.822233    
## int_colOther                  1.328e-01  6.516e-02   2.038 0.041732 *  
## int_colRed                    8.500e-02  4.554e-02   1.867 0.062135 .  
## int_colWhite                  5.106e-02  5.255e-02   0.972 0.331390    
## accidentNone reported         6.767e-02  1.846e-02   3.666 0.000254 ***
## cylinders                     2.130e-02  1.652e-02   1.289 0.197481    
## horsepower                    2.734e-03  1.521e-04  17.973  < 2e-16 ***
## liters                        6.743e-03  1.942e-02   0.347 0.728487    
## transmission_typeA/T and M/T  2.315e-02  2.514e-02   0.921 0.357363    
## transmission_typeCVT         -3.029e-02  5.705e-02  -0.531 0.595570    
## transmission_typeM/T          1.181e-01  3.008e-02   3.927 8.94e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3397 on 1742 degrees of freedom
## Multiple R-squared:  0.8329, Adjusted R-squared:  0.8265 
## F-statistic: 131.5 on 66 and 1742 DF,  p-value: < 2.2e-16
# Testing initial generalizability aiming for future improvement:
# 10-fold cross-validated error of the full model.
#
# cv.glm() takes the observed response from `glmfit$y`. An lm() fit does not
# store a `y` component, so passing `full_model` directly made every fold's
# cost evaluate to NaN. Refit the identical Gaussian linear model with glm(),
# which keeps the response, and cross-validate that object instead.
library(boot)
set.seed(100)
full_model_glm <- glm(price ~ . - id, data = train)
cv_full_model <- cv.glm(data = train, glmfit = full_model_glm, K = 10)
# (Re-knit to refresh the printed values; the earlier run printed NaN for both.)
cv_full_model$delta[1]   # Estimated test MSE
sqrt(cv_full_model$delta[1])  # Estimated test RMSE
# Backward elimination: starting from the full model, repeatedly drop the term
# whose removal lowers AIC the most, until no removal improves AIC
backward_model <- step(
  object = full_model,
  direction = "backward"
)
## Start:  AIC=-3840.69
## price ~ (id + brand + model_year + mileage + fuel_type + ext_col + 
##     int_col + accident + cylinders + horsepower + liters + transmission_type) - 
##     id
## 
##                     Df Sum of Sq    RSS     AIC
## - ext_col           12     2.127 203.14 -3845.6
## - int_col            7     1.219 202.23 -3843.7
## - liters             1     0.014 201.02 -3842.6
## - cylinders          1     0.192 201.20 -3841.0
## <none>                           201.01 -3840.7
## - transmission_type  3     1.876 202.89 -3829.9
## - accident           1     1.550 202.56 -3828.8
## - fuel_type          3    16.225 217.24 -3706.3
## - horsepower         1    37.275 238.28 -3535.0
## - brand             35    56.755 257.76 -3460.8
## - model_year         1    51.501 252.51 -3430.1
## - mileage            1    82.543 283.55 -3220.3
## 
## Step:  AIC=-3845.64
## price ~ brand + model_year + mileage + fuel_type + int_col + 
##     accident + cylinders + horsepower + liters + transmission_type
## 
##                     Df Sum of Sq    RSS     AIC
## - int_col            7     1.077 204.21 -3850.1
## - liters             1     0.037 203.18 -3847.3
## - cylinders          1     0.125 203.26 -3846.5
## <none>                           203.14 -3845.6
## - transmission_type  3     1.835 204.97 -3835.4
## - accident           1     1.569 204.71 -3833.7
## - fuel_type          3    16.026 219.16 -3714.3
## - horsepower         1    38.046 241.18 -3537.1
## - brand             35    57.607 260.74 -3464.0
## - model_year         1    53.467 256.61 -3425.0
## - mileage            1    85.613 288.75 -3211.5
## 
## Step:  AIC=-3850.08
## price ~ brand + model_year + mileage + fuel_type + accident + 
##     cylinders + horsepower + liters + transmission_type
## 
##                     Df Sum of Sq    RSS     AIC
## - liters             1     0.048 204.26 -3851.7
## - cylinders          1     0.094 204.31 -3851.3
## <none>                           204.21 -3850.1
## - accident           1     1.481 205.69 -3839.0
## - transmission_type  3     1.971 206.19 -3838.7
## - fuel_type          3    16.378 220.59 -3716.5
## - horsepower         1    39.739 243.95 -3530.4
## - brand             35    58.095 262.31 -3467.2
## - model_year         1    55.500 259.71 -3417.2
## - mileage            1    87.623 291.84 -3206.2
## 
## Step:  AIC=-3851.66
## price ~ brand + model_year + mileage + fuel_type + accident + 
##     cylinders + horsepower + transmission_type
## 
##                     Df Sum of Sq    RSS     AIC
## <none>                           204.26 -3851.7
## - cylinders          1     0.570 204.83 -3848.6
## - accident           1     1.505 205.77 -3840.4
## - transmission_type  3     1.977 206.24 -3840.2
## - fuel_type          3    19.974 224.24 -3688.9
## - horsepower         1    42.374 246.64 -3512.6
## - brand             35    58.537 262.80 -3465.8
## - model_year         1    55.497 259.76 -3418.9
## - mileage            1    87.595 291.86 -3208.1

The full linear model fits the training data well (adjusted R² of about 0.83); how well it generalizes should be judged from the cross-validated RMSE rather than from the in-sample fit. Backward selection suggests removing predictors like exterior color, interior color, and engine size (liters) because they added little predictive value once other variables were considered. Key variables like brand, model year, mileage, and horsepower were kept because dropping them significantly worsened model performance.

# Refit the specification chosen by backward selection as a plain linear model.
# The formula is kept inline (not stored in a variable) so that the `Call:`
# printed by summary() shows the actual terms.
backward_model <- lm(
  formula = price ~ brand + model_year + mileage + fuel_type + accident +
    cylinders + horsepower + transmission_type,
  data = train
)

# Coefficient table and overall fit statistics
summary(object = backward_model)
## 
## Call:
## lm(formula = price ~ brand + model_year + mileage + fuel_type + 
##     accident + cylinders + horsepower + transmission_type, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5225 -0.1849 -0.0053  0.1802  4.6525 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -8.444e+01  4.332e+00 -19.494  < 2e-16 ***
## brandAlfa                    -3.859e-02  1.546e-01  -0.250 0.802984    
## brandAudi                     1.009e-01  7.497e-02   1.345 0.178649    
## brandBMW                      7.094e-02  7.070e-02   1.003 0.315818    
## brandBuick                   -3.251e-01  1.373e-01  -2.368 0.018014 *  
## brandCadillac                 1.785e-02  8.227e-02   0.217 0.828238    
## brandChevrolet                4.454e-02  7.308e-02   0.610 0.542253    
## brandChrysler                -4.201e-01  1.131e-01  -3.713 0.000211 ***
## brandDodge                   -6.011e-02  8.399e-02  -0.716 0.474308    
## brandFord                     1.224e-02  7.147e-02   0.171 0.864000    
## brandGenesis                 -1.288e-01  1.666e-01  -0.773 0.439515    
## brandGMC                      9.710e-02  8.411e-02   1.154 0.248480    
## brandHonda                    2.552e-01  9.418e-02   2.710 0.006795 ** 
## brandHummer                   6.193e-01  1.460e-01   4.241 2.34e-05 ***
## brandHyundai                 -2.936e-01  8.748e-02  -3.357 0.000805 ***
## brandINFINITI                -1.822e-01  8.772e-02  -2.077 0.037930 *  
## brandJaguar                  -7.319e-02  9.391e-02  -0.779 0.435874    
## brandJeep                     2.597e-01  7.943e-02   3.269 0.001100 ** 
## brandKia                     -6.039e-02  9.547e-02  -0.633 0.527109    
## brandLand                     2.110e-01  7.915e-02   2.666 0.007740 ** 
## brandLexus                    2.859e-01  7.574e-02   3.775 0.000165 ***
## brandLincoln                 -7.234e-02  9.302e-02  -0.778 0.436850    
## brandMaserati                 2.324e-01  1.135e-01   2.048 0.040698 *  
## brandMazda                    1.446e-01  1.083e-01   1.335 0.182157    
## brandMercedes-Benz            1.598e-01  7.180e-02   2.226 0.026157 *  
## brandMINI                    -1.591e-01  1.103e-01  -1.441 0.149629    
## brandMitsubishi               1.407e-01  1.189e-01   1.184 0.236723    
## brandNissan                  -2.689e-02  8.011e-02  -0.336 0.737113    
## brandOther                    4.284e-01  8.940e-02   4.792 1.79e-06 ***
## brandPontiac                  5.045e-02  1.321e-01   0.382 0.702621    
## brandPorsche                  6.134e-01  7.638e-02   8.031 1.75e-15 ***
## brandRAM                     -1.224e-02  9.631e-02  -0.127 0.898891    
## brandSubaru                   1.597e-01  8.830e-02   1.809 0.070636 .  
## brandToyota                   2.773e-01  7.467e-02   3.713 0.000211 ***
## brandVolkswagen               7.286e-03  9.429e-02   0.077 0.938418    
## brandVolvo                   -2.965e-02  1.152e-01  -0.257 0.796947    
## model_year                    4.687e-02  2.142e-03  21.880  < 2e-16 ***
## mileage                      -5.586e-06  2.032e-07 -27.488  < 2e-16 ***
## fuel_typeE85 Flex Fuel       -6.114e-01  6.229e-02  -9.816  < 2e-16 ***
## fuel_typeGasoline            -6.473e-01  4.980e-02 -12.997  < 2e-16 ***
## fuel_typeHybrid              -5.667e-01  6.086e-02  -9.313  < 2e-16 ***
## accidentNone reported         6.633e-02  1.841e-02   3.604 0.000323 ***
## cylinders                     2.293e-02  1.034e-02   2.218 0.026692 *  
## horsepower                    2.794e-03  1.461e-04  19.119  < 2e-16 ***
## transmission_typeA/T and M/T  2.407e-02  2.503e-02   0.962 0.336338    
## transmission_typeCVT         -3.885e-02  5.689e-02  -0.683 0.494797    
## transmission_typeM/T          1.183e-01  2.969e-02   3.983 7.08e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3405 on 1762 degrees of freedom
## Multiple R-squared:  0.8301, Adjusted R-squared:  0.8257 
## F-statistic: 187.2 on 46 and 1762 DF,  p-value: < 2.2e-16
# Residual diagnostic plots to check for non-linearity, unequal variance, and
# outliers. The four default plot.lm() panels are drawn on a 2 x 2 grid; the
# previous graphics settings are saved and restored afterwards so the grid
# layout does not leak into later plots.
old_par <- par(mfrow = c(2, 2))
plot(backward_model)
par(old_par)

The model shows strong fit with an adjusted R² of 0.83 and low residual standard error. Residuals are mostly homoscedastic and centered, but the slight curve in the Residuals vs. Fitted and Scale-Location plots suggests minor nonlinearity. The Q-Q plot shows mild right-skew, and a few high-leverage points appear, but none are overly influential.

# Same specification as the backward-selection model, except that mileage
# enters on the log scale
log_model <- lm(
  formula = price ~ brand + model_year + log(mileage) + fuel_type + accident +
    cylinders + horsepower + transmission_type,
  data = train
)

# Print both summaries for a side-by-side comparison
summary(object = backward_model)
## 
## Call:
## lm(formula = price ~ brand + model_year + mileage + fuel_type + 
##     accident + cylinders + horsepower + transmission_type, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5225 -0.1849 -0.0053  0.1802  4.6525 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -8.444e+01  4.332e+00 -19.494  < 2e-16 ***
## brandAlfa                    -3.859e-02  1.546e-01  -0.250 0.802984    
## brandAudi                     1.009e-01  7.497e-02   1.345 0.178649    
## brandBMW                      7.094e-02  7.070e-02   1.003 0.315818    
## brandBuick                   -3.251e-01  1.373e-01  -2.368 0.018014 *  
## brandCadillac                 1.785e-02  8.227e-02   0.217 0.828238    
## brandChevrolet                4.454e-02  7.308e-02   0.610 0.542253    
## brandChrysler                -4.201e-01  1.131e-01  -3.713 0.000211 ***
## brandDodge                   -6.011e-02  8.399e-02  -0.716 0.474308    
## brandFord                     1.224e-02  7.147e-02   0.171 0.864000    
## brandGenesis                 -1.288e-01  1.666e-01  -0.773 0.439515    
## brandGMC                      9.710e-02  8.411e-02   1.154 0.248480    
## brandHonda                    2.552e-01  9.418e-02   2.710 0.006795 ** 
## brandHummer                   6.193e-01  1.460e-01   4.241 2.34e-05 ***
## brandHyundai                 -2.936e-01  8.748e-02  -3.357 0.000805 ***
## brandINFINITI                -1.822e-01  8.772e-02  -2.077 0.037930 *  
## brandJaguar                  -7.319e-02  9.391e-02  -0.779 0.435874    
## brandJeep                     2.597e-01  7.943e-02   3.269 0.001100 ** 
## brandKia                     -6.039e-02  9.547e-02  -0.633 0.527109    
## brandLand                     2.110e-01  7.915e-02   2.666 0.007740 ** 
## brandLexus                    2.859e-01  7.574e-02   3.775 0.000165 ***
## brandLincoln                 -7.234e-02  9.302e-02  -0.778 0.436850    
## brandMaserati                 2.324e-01  1.135e-01   2.048 0.040698 *  
## brandMazda                    1.446e-01  1.083e-01   1.335 0.182157    
## brandMercedes-Benz            1.598e-01  7.180e-02   2.226 0.026157 *  
## brandMINI                    -1.591e-01  1.103e-01  -1.441 0.149629    
## brandMitsubishi               1.407e-01  1.189e-01   1.184 0.236723    
## brandNissan                  -2.689e-02  8.011e-02  -0.336 0.737113    
## brandOther                    4.284e-01  8.940e-02   4.792 1.79e-06 ***
## brandPontiac                  5.045e-02  1.321e-01   0.382 0.702621    
## brandPorsche                  6.134e-01  7.638e-02   8.031 1.75e-15 ***
## brandRAM                     -1.224e-02  9.631e-02  -0.127 0.898891    
## brandSubaru                   1.597e-01  8.830e-02   1.809 0.070636 .  
## brandToyota                   2.773e-01  7.467e-02   3.713 0.000211 ***
## brandVolkswagen               7.286e-03  9.429e-02   0.077 0.938418    
## brandVolvo                   -2.965e-02  1.152e-01  -0.257 0.796947    
## model_year                    4.687e-02  2.142e-03  21.880  < 2e-16 ***
## mileage                      -5.586e-06  2.032e-07 -27.488  < 2e-16 ***
## fuel_typeE85 Flex Fuel       -6.114e-01  6.229e-02  -9.816  < 2e-16 ***
## fuel_typeGasoline            -6.473e-01  4.980e-02 -12.997  < 2e-16 ***
## fuel_typeHybrid              -5.667e-01  6.086e-02  -9.313  < 2e-16 ***
## accidentNone reported         6.633e-02  1.841e-02   3.604 0.000323 ***
## cylinders                     2.293e-02  1.034e-02   2.218 0.026692 *  
## horsepower                    2.794e-03  1.461e-04  19.119  < 2e-16 ***
## transmission_typeA/T and M/T  2.407e-02  2.503e-02   0.962 0.336338    
## transmission_typeCVT         -3.885e-02  5.689e-02  -0.683 0.494797    
## transmission_typeM/T          1.183e-01  2.969e-02   3.983 7.08e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3405 on 1762 degrees of freedom
## Multiple R-squared:  0.8301, Adjusted R-squared:  0.8257 
## F-statistic: 187.2 on 46 and 1762 DF,  p-value: < 2.2e-16
summary(object = log_model)
## 
## Call:
## lm(formula = price ~ brand + model_year + log(mileage) + fuel_type + 
##     accident + cylinders + horsepower + transmission_type, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.6134 -0.1856  0.0184  0.2007  4.7705 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -9.695e+01  4.555e+00 -21.283  < 2e-16 ***
## brandAlfa                     3.846e-02  1.646e-01   0.234 0.815311    
## brandAudi                     2.133e-01  7.983e-02   2.672 0.007614 ** 
## brandBMW                      1.696e-01  7.521e-02   2.255 0.024277 *  
## brandBuick                   -2.372e-01  1.462e-01  -1.622 0.104916    
## brandCadillac                 8.850e-02  8.759e-02   1.010 0.312499    
## brandChevrolet                8.561e-02  7.779e-02   1.101 0.271226    
## brandChrysler                -3.176e-01  1.204e-01  -2.638 0.008412 ** 
## brandDodge                   -4.905e-02  8.947e-02  -0.548 0.583552    
## brandFord                     7.249e-02  7.609e-02   0.953 0.340845    
## brandGenesis                 -4.678e-02  1.774e-01  -0.264 0.792004    
## brandGMC                      1.344e-01  8.959e-02   1.501 0.133654    
## brandHonda                    2.370e-01  1.003e-01   2.363 0.018242 *  
## brandHummer                   7.193e-01  1.554e-01   4.628 3.97e-06 ***
## brandHyundai                 -2.704e-01  9.316e-02  -2.902 0.003753 ** 
## brandINFINITI                -9.056e-02  9.344e-02  -0.969 0.332580    
## brandJaguar                   1.015e-01  9.961e-02   1.019 0.308267    
## brandJeep                     3.040e-01  8.457e-02   3.595 0.000333 ***
## brandKia                      2.115e-03  1.016e-01   0.021 0.983398    
## brandLand                     2.835e-01  8.432e-02   3.362 0.000790 ***
## brandLexus                    3.311e-01  8.063e-02   4.106 4.20e-05 ***
## brandLincoln                  1.712e-02  9.909e-02   0.173 0.862837    
## brandMaserati                 3.919e-01  1.205e-01   3.252 0.001167 ** 
## brandMazda                    1.811e-01  1.153e-01   1.570 0.116583    
## brandMercedes-Benz            2.870e-01  7.625e-02   3.765 0.000172 ***
## brandMINI                    -8.927e-02  1.175e-01  -0.760 0.447572    
## brandMitsubishi               1.613e-01  1.266e-01   1.274 0.202755    
## brandNissan                   3.945e-02  8.528e-02   0.463 0.643724    
## brandOther                    5.211e-01  9.503e-02   5.483 4.78e-08 ***
## brandPontiac                  2.093e-01  1.404e-01   1.491 0.136194    
## brandPorsche                  7.430e-01  8.108e-02   9.164  < 2e-16 ***
## brandRAM                      1.074e-01  1.027e-01   1.046 0.295788    
## brandSubaru                   2.086e-01  9.406e-02   2.217 0.026740 *  
## brandToyota                   2.313e-01  7.956e-02   2.908 0.003688 ** 
## brandVolkswagen               8.775e-02  1.003e-01   0.875 0.381965    
## brandVolvo                    4.810e-02  1.228e-01   0.392 0.695334    
## model_year                    5.395e-02  2.233e-03  24.161  < 2e-16 ***
## log(mileage)                 -2.173e-01  1.016e-02 -21.379  < 2e-16 ***
## fuel_typeE85 Flex Fuel       -5.244e-01  6.654e-02  -7.881 5.65e-15 ***
## fuel_typeGasoline            -5.379e-01  5.290e-02 -10.168  < 2e-16 ***
## fuel_typeHybrid              -5.164e-01  6.473e-02  -7.977 2.67e-15 ***
## accidentNone reported         8.393e-02  1.959e-02   4.284 1.94e-05 ***
## cylinders                     1.968e-02  1.101e-02   1.788 0.073972 .  
## horsepower                    2.785e-03  1.562e-04  17.823  < 2e-16 ***
## transmission_typeA/T and M/T  4.616e-02  2.665e-02   1.732 0.083392 .  
## transmission_typeCVT          4.728e-03  6.062e-02   0.078 0.937835    
## transmission_typeM/T          1.611e-01  3.149e-02   5.115 3.47e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3627 on 1762 degrees of freedom
## Multiple R-squared:  0.8073, Adjusted R-squared:  0.8023 
## F-statistic: 160.5 on 46 and 1762 DF,  p-value: < 2.2e-16
# Compare AIC (lower is better). Both fits use the same response and the same
# number of estimated parameters (df = 48 in the table below), so the AIC gap
# reflects only the difference in fit between mileage and log(mileage).
AIC(backward_model, log_model)
##                df      AIC
## backward_model 48 1284.065
## log_model      48 1512.403
# Diagnostic plots for the log model, drawn on a 2 x 2 grid. The previous
# graphics settings are saved and restored afterwards so the grid layout does
# not leak into later plots.
old_par <- par(mfrow = c(2, 2))
plot(log_model)
par(old_par)

The backward selection model using untransformed mileage performed better overall. It had a higher adjusted R² (0.826 vs. 0.802), a lower residual standard error (0.340 vs. 0.363), and a much lower AIC (1284 vs. 1512), indicating a better fit. Diagnostic plots also showed no meaningful improvement with the log-transformed model.

# Extend the backward-selection model with three interactions:
# brand x transmission type, model year x mileage, horsepower x transmission.
# NOTE(review): the summary reports 43 coefficients as "not defined because of
# singularities" -- presumably brand/transmission combinations that never occur
# in `train`; confirm before interpreting the interaction terms.
interaction_model <- lm(
  formula = price ~ brand + model_year + mileage + fuel_type + accident +
    cylinders + horsepower + transmission_type + brand:transmission_type +
    model_year:mileage + horsepower:transmission_type,
  data = train
)

# Coefficient table and overall fit statistics
summary(object = interaction_model)
## 
## Call:
## lm(formula = price ~ brand + model_year + mileage + fuel_type + 
##     accident + cylinders + horsepower + transmission_type + brand:transmission_type + 
##     model_year:mileage + horsepower:transmission_type, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.6967 -0.1673  0.0000  0.1811  4.0788 
## 
## Coefficients: (43 not defined because of singularities)
##                                                   Estimate Std. Error t value
## (Intercept)                                     -1.097e+02  5.455e+00 -20.106
## brandAlfa                                       -2.648e-02  1.635e-01  -0.162
## brandAudi                                        1.708e-01  8.233e-02   2.074
## brandBMW                                         6.873e-02  7.815e-02   0.879
## brandBuick                                      -2.728e-01  1.427e-01  -1.913
## brandCadillac                                    5.099e-02  8.701e-02   0.586
## brandChevrolet                                   1.373e-01  7.908e-02   1.736
## brandChrysler                                   -3.612e-01  1.162e-01  -3.109
## brandDodge                                      -6.941e-03  8.989e-02  -0.077
## brandFord                                        1.074e-01  7.736e-02   1.388
## brandGenesis                                    -1.012e-01  1.789e-01  -0.566
## brandGMC                                         1.326e-01  8.720e-02   1.521
## brandHonda                                       9.673e-02  1.128e-01   0.858
## brandHummer                                      6.429e-01  1.440e-01   4.463
## brandHyundai                                    -1.534e-01  9.612e-02  -1.596
## brandINFINITI                                   -8.373e-02  1.013e-01  -0.827
## brandJaguar                                     -2.840e-02  9.805e-02  -0.290
## brandJeep                                        2.526e-01  8.604e-02   2.935
## brandKia                                        -2.742e-02  9.914e-02  -0.277
## brandLand                                        2.709e-01  8.463e-02   3.201
## brandLexus                                       3.209e-01  8.142e-02   3.941
## brandLincoln                                    -4.633e-02  9.999e-02  -0.463
## brandMaserati                                   -7.238e-02  1.313e-01  -0.551
## brandMazda                                       1.261e-01  1.628e-01   0.775
## brandMercedes-Benz                               2.105e-01  7.735e-02   2.722
## brandMINI                                       -1.053e-01  1.362e-01  -0.773
## brandMitsubishi                                 -1.216e-01  1.629e-01  -0.747
## brandNissan                                     -3.840e-02  9.040e-02  -0.425
## brandOther                                       4.865e-01  1.032e-01   4.716
## brandPontiac                                     1.424e-01  1.522e-01   0.935
## brandPorsche                                     6.565e-01  8.738e-02   7.514
## brandRAM                                         5.347e-02  9.916e-02   0.539
## brandSubaru                                      1.746e-01  1.361e-01   1.283
## brandToyota                                      3.091e-01  8.059e-02   3.836
## brandVolkswagen                                  2.101e-01  1.187e-01   1.770
## brandVolvo                                       2.212e-02  1.217e-01   0.182
## model_year                                       5.935e-02  2.701e-03  21.974
## mileage                                          3.013e-04  4.789e-05   6.292
## fuel_typeE85 Flex Fuel                          -5.724e-01  6.062e-02  -9.442
## fuel_typeGasoline                               -6.031e-01  4.850e-02 -12.436
## fuel_typeHybrid                                 -5.623e-01  6.036e-02  -9.315
## accidentNone reported                            5.322e-02  1.808e-02   2.944
## cylinders                                        2.978e-02  1.022e-02   2.913
## horsepower                                       2.747e-03  1.555e-04  17.663
## transmission_typeA/T and M/T                     4.002e-01  2.098e-01   1.907
## transmission_typeCVT                             1.671e-01  3.056e-01   0.547
## transmission_typeM/T                             2.185e-01  2.175e-01   1.004
## brandAlfa:transmission_typeA/T and M/T          -1.698e-01  4.113e-01  -0.413
## brandAudi:transmission_typeA/T and M/T          -2.699e-01  2.190e-01  -1.233
## brandBMW:transmission_typeA/T and M/T           -1.522e-01  2.116e-01  -0.719
## brandBuick:transmission_typeA/T and M/T         -1.291e-01  4.060e-01  -0.318
## brandCadillac:transmission_typeA/T and M/T       4.074e-02  2.678e-01   0.152
## brandChevrolet:transmission_typeA/T and M/T     -2.760e-01  2.208e-01  -1.250
## brandChrysler:transmission_typeA/T and M/T      -1.688e-01  3.951e-01  -0.427
## brandDodge:transmission_typeA/T and M/T         -2.997e-01  2.712e-01  -1.105
## brandFord:transmission_typeA/T and M/T          -4.021e-01  2.351e-01  -1.710
## brandGenesis:transmission_typeA/T and M/T       -3.245e-01  4.189e-01  -0.775
## brandGMC:transmission_typeA/T and M/T                   NA         NA      NA
## brandHonda:transmission_typeA/T and M/T         -3.676e-01  3.949e-01  -0.931
## brandHummer:transmission_typeA/T and M/T                NA         NA      NA
## brandHyundai:transmission_typeA/T and M/T       -4.269e-01  2.581e-01  -1.654
## brandINFINITI:transmission_typeA/T and M/T      -3.740e-01  2.415e-01  -1.549
## brandJaguar:transmission_typeA/T and M/T        -1.616e-02  3.144e-01  -0.051
## brandJeep:transmission_typeA/T and M/T          -1.201e-01  3.118e-01  -0.385
## brandKia:transmission_typeA/T and M/T                   NA         NA      NA
## brandLand:transmission_typeA/T and M/T          -2.445e-01  2.382e-01  -1.026
## brandLexus:transmission_typeA/T and M/T         -2.234e-01  2.281e-01  -0.979
## brandLincoln:transmission_typeA/T and M/T       -2.155e-02  2.594e-01  -0.083
## brandMaserati:transmission_typeA/T and M/T       8.074e-01  2.744e-01   2.942
## brandMazda:transmission_typeA/T and M/T         -1.610e-01  2.982e-01  -0.540
## brandMercedes-Benz:transmission_typeA/T and M/T -1.455e-01  2.176e-01  -0.669
## brandMINI:transmission_typeA/T and M/T           9.531e-02  4.011e-01   0.238
## brandMitsubishi:transmission_typeA/T and M/T     5.549e-01  4.113e-01   1.349
## brandNissan:transmission_typeA/T and M/T         3.641e-01  2.479e-01   1.469
## brandOther:transmission_typeA/T and M/T          2.131e-01  2.591e-01   0.822
## brandPontiac:transmission_typeA/T and M/T       -3.841e-01  4.073e-01  -0.943
## brandPorsche:transmission_typeA/T and M/T       -3.207e-01  2.226e-01  -1.441
## brandRAM:transmission_typeA/T and M/T           -9.270e-02  3.918e-01  -0.237
## brandSubaru:transmission_typeA/T and M/T        -2.709e-01  3.002e-01  -0.902
## brandToyota:transmission_typeA/T and M/T        -2.946e-01  2.524e-01  -1.167
## brandVolkswagen:transmission_typeA/T and M/T    -5.445e-01  2.595e-01  -2.098
## brandVolvo:transmission_typeA/T and M/T         -9.981e-02  3.232e-01  -0.309
## brandAlfa:transmission_typeCVT                          NA         NA      NA
## brandAudi:transmission_typeCVT                          NA         NA      NA
## brandBMW:transmission_typeCVT                           NA         NA      NA
## brandBuick:transmission_typeCVT                         NA         NA      NA
## brandCadillac:transmission_typeCVT                      NA         NA      NA
## brandChevrolet:transmission_typeCVT                     NA         NA      NA
## brandChrysler:transmission_typeCVT                      NA         NA      NA
## brandDodge:transmission_typeCVT                         NA         NA      NA
## brandFord:transmission_typeCVT                  -3.083e-02  2.145e-01  -0.144
## brandGenesis:transmission_typeCVT                       NA         NA      NA
## brandGMC:transmission_typeCVT                           NA         NA      NA
## brandHonda:transmission_typeCVT                  4.522e-01  2.613e-01   1.731
## brandHummer:transmission_typeCVT                        NA         NA      NA
## brandHyundai:transmission_typeCVT                       NA         NA      NA
## brandINFINITI:transmission_typeCVT               2.810e-01  3.814e-01   0.737
## brandJaguar:transmission_typeCVT                        NA         NA      NA
## brandJeep:transmission_typeCVT                          NA         NA      NA
## brandKia:transmission_typeCVT                   -3.075e-02  3.105e-01  -0.099
## brandLand:transmission_typeCVT                          NA         NA      NA
## brandLexus:transmission_typeCVT                  2.275e-01  2.370e-01   0.960
## brandLincoln:transmission_typeCVT                       NA         NA      NA
## brandMaserati:transmission_typeCVT                      NA         NA      NA
## brandMazda:transmission_typeCVT                         NA         NA      NA
## brandMercedes-Benz:transmission_typeCVT                 NA         NA      NA
## brandMINI:transmission_typeCVT                          NA         NA      NA
## brandMitsubishi:transmission_typeCVT             1.432e-01  3.300e-01   0.434
## brandNissan:transmission_typeCVT                 6.764e-02  2.048e-01   0.330
## brandOther:transmission_typeCVT                         NA         NA      NA
## brandPontiac:transmission_typeCVT                       NA         NA      NA
## brandPorsche:transmission_typeCVT                       NA         NA      NA
## brandRAM:transmission_typeCVT                           NA         NA      NA
## brandSubaru:transmission_typeCVT                 1.280e-01  2.350e-01   0.545
## brandToyota:transmission_typeCVT                        NA         NA      NA
## brandVolkswagen:transmission_typeCVT                    NA         NA      NA
## brandVolvo:transmission_typeCVT                         NA         NA      NA
## brandAlfa:transmission_typeM/T                          NA         NA      NA
## brandAudi:transmission_typeM/T                  -1.467e-01  2.477e-01  -0.592
## brandBMW:transmission_typeM/T                    9.741e-02  2.165e-01   0.450
## brandBuick:transmission_typeM/T                         NA         NA      NA
## brandCadillac:transmission_typeM/T              -3.027e-01  3.912e-01  -0.774
## brandChevrolet:transmission_typeM/T             -5.403e-01  2.300e-01  -2.349
## brandChrysler:transmission_typeM/T                      NA         NA      NA
## brandDodge:transmission_typeM/T                 -4.212e-01  2.818e-01  -1.495
## brandFord:transmission_typeM/T                  -3.881e-01  2.218e-01  -1.749
## brandGenesis:transmission_typeM/T                       NA         NA      NA
## brandGMC:transmission_typeM/T                           NA         NA      NA
## brandHonda:transmission_typeM/T                  4.106e-01  2.522e-01   1.628
## brandHummer:transmission_typeM/T                        NA         NA      NA
## brandHyundai:transmission_typeM/T               -6.043e-01  2.675e-01  -2.259
## brandINFINITI:transmission_typeM/T              -6.771e-02  2.703e-01  -0.251
## brandJaguar:transmission_typeM/T                        NA         NA      NA
## brandJeep:transmission_typeM/T                   7.678e-02  2.295e-01   0.334
## brandKia:transmission_typeM/T                           NA         NA      NA
## brandLand:transmission_typeM/T                          NA         NA      NA
## brandLexus:transmission_typeM/T                 -1.832e-01  3.860e-01  -0.475
## brandLincoln:transmission_typeM/T                       NA         NA      NA
## brandMaserati:transmission_typeM/T                      NA         NA      NA
## brandMazda:transmission_typeM/T                 -1.183e-02  2.787e-01  -0.042
## brandMercedes-Benz:transmission_typeM/T         -7.469e-01  3.852e-01  -1.939
## brandMINI:transmission_typeM/T                  -1.507e-01  2.691e-01  -0.560
## brandMitsubishi:transmission_typeM/T             5.205e-01  2.985e-01   1.744
## brandNissan:transmission_typeM/T                -1.260e-01  2.483e-01  -0.508
## brandOther:transmission_typeM/T                 -4.339e-01  2.482e-01  -1.748
## brandPontiac:transmission_typeM/T               -1.120e-01  3.350e-01  -0.334
## brandPorsche:transmission_typeM/T                1.678e-02  2.236e-01   0.075
## brandRAM:transmission_typeM/T                           NA         NA      NA
## brandSubaru:transmission_typeM/T                -1.069e-01  2.468e-01  -0.433
## brandToyota:transmission_typeM/T                -1.501e-01  2.358e-01  -0.636
## brandVolkswagen:transmission_typeM/T            -4.318e-01  2.512e-01  -1.719
## brandVolvo:transmission_typeM/T                         NA         NA      NA
## model_year:mileage                              -1.526e-07  2.380e-08  -6.410
## horsepower:transmission_typeA/T and M/T         -5.038e-04  2.322e-04  -2.170
## horsepower:transmission_typeCVT                 -1.249e-03  1.085e-03  -1.152
## horsepower:transmission_typeM/T                  1.844e-04  3.441e-04   0.536
##                                                 Pr(>|t|)    
## (Intercept)                                      < 2e-16 ***
## brandAlfa                                        0.87136    
## brandAudi                                        0.03819 *  
## brandBMW                                         0.37931    
## brandBuick                                       0.05596 .  
## brandCadillac                                    0.55793    
## brandChevrolet                                   0.08271 .  
## brandChrysler                                    0.00191 ** 
## brandDodge                                       0.93846    
## brandFord                                        0.16538    
## brandGenesis                                     0.57178    
## brandGMC                                         0.12845    
## brandHonda                                       0.39108    
## brandHummer                                     8.62e-06 ***
## brandHyundai                                     0.11063    
## brandINFINITI                                    0.40848    
## brandJaguar                                      0.77211    
## brandJeep                                        0.00338 ** 
## brandKia                                         0.78214    
## brandLand                                        0.00140 ** 
## brandLexus                                      8.44e-05 ***
## brandLincoln                                     0.64320    
## brandMaserati                                    0.58141    
## brandMazda                                       0.43869    
## brandMercedes-Benz                               0.00655 ** 
## brandMINI                                        0.43957    
## brandMitsubishi                                  0.45542    
## brandNissan                                      0.67104    
## brandOther                                      2.61e-06 ***
## brandPontiac                                     0.34973    
## brandPorsche                                    9.25e-14 ***
## brandRAM                                         0.58980    
## brandSubaru                                      0.19963    
## brandToyota                                      0.00013 ***
## brandVolkswagen                                  0.07685 .  
## brandVolvo                                       0.85579    
## model_year                                       < 2e-16 ***
## mileage                                         3.99e-10 ***
## fuel_typeE85 Flex Fuel                           < 2e-16 ***
## fuel_typeGasoline                                < 2e-16 ***
## fuel_typeHybrid                                  < 2e-16 ***
## accidentNone reported                            0.00328 ** 
## cylinders                                        0.00362 ** 
## horsepower                                       < 2e-16 ***
## transmission_typeA/T and M/T                     0.05667 .  
## transmission_typeCVT                             0.58462    
## transmission_typeM/T                             0.31537    
## brandAlfa:transmission_typeA/T and M/T           0.67979    
## brandAudi:transmission_typeA/T and M/T           0.21791    
## brandBMW:transmission_typeA/T and M/T            0.47195    
## brandBuick:transmission_typeA/T and M/T          0.75055    
## brandCadillac:transmission_typeA/T and M/T       0.87911    
## brandChevrolet:transmission_typeA/T and M/T      0.21145    
## brandChrysler:transmission_typeA/T and M/T       0.66919    
## brandDodge:transmission_typeA/T and M/T          0.26929    
## brandFord:transmission_typeA/T and M/T           0.08737 .  
## brandGenesis:transmission_typeA/T and M/T        0.43872    
## brandGMC:transmission_typeA/T and M/T                 NA    
## brandHonda:transmission_typeA/T and M/T          0.35205    
## brandHummer:transmission_typeA/T and M/T              NA    
## brandHyundai:transmission_typeA/T and M/T        0.09827 .  
## brandINFINITI:transmission_typeA/T and M/T       0.12157    
## brandJaguar:transmission_typeA/T and M/T         0.95902    
## brandJeep:transmission_typeA/T and M/T           0.70023    
## brandKia:transmission_typeA/T and M/T                 NA    
## brandLand:transmission_typeA/T and M/T           0.30485    
## brandLexus:transmission_typeA/T and M/T          0.32764    
## brandLincoln:transmission_typeA/T and M/T        0.93381    
## brandMaserati:transmission_typeA/T and M/T       0.00330 ** 
## brandMazda:transmission_typeA/T and M/T          0.58944    
## brandMercedes-Benz:transmission_typeA/T and M/T  0.50375    
## brandMINI:transmission_typeA/T and M/T           0.81220    
## brandMitsubishi:transmission_typeA/T and M/T     0.17750    
## brandNissan:transmission_typeA/T and M/T         0.14207    
## brandOther:transmission_typeA/T and M/T          0.41095    
## brandPontiac:transmission_typeA/T and M/T        0.34581    
## brandPorsche:transmission_typeA/T and M/T        0.14990    
## brandRAM:transmission_typeA/T and M/T            0.81301    
## brandSubaru:transmission_typeA/T and M/T         0.36695    
## brandToyota:transmission_typeA/T and M/T         0.24323    
## brandVolkswagen:transmission_typeA/T and M/T     0.03606 *  
## brandVolvo:transmission_typeA/T and M/T          0.75748    
## brandAlfa:transmission_typeCVT                        NA    
## brandAudi:transmission_typeCVT                        NA    
## brandBMW:transmission_typeCVT                         NA    
## brandBuick:transmission_typeCVT                       NA    
## brandCadillac:transmission_typeCVT                    NA    
## brandChevrolet:transmission_typeCVT                   NA    
## brandChrysler:transmission_typeCVT                    NA    
## brandDodge:transmission_typeCVT                       NA    
## brandFord:transmission_typeCVT                   0.88571    
## brandGenesis:transmission_typeCVT                     NA    
## brandGMC:transmission_typeCVT                         NA    
## brandHonda:transmission_typeCVT                  0.08364 .  
## brandHummer:transmission_typeCVT                      NA    
## brandHyundai:transmission_typeCVT                     NA    
## brandINFINITI:transmission_typeCVT               0.46136    
## brandJaguar:transmission_typeCVT                      NA    
## brandJeep:transmission_typeCVT                        NA    
## brandKia:transmission_typeCVT                    0.92111    
## brandLand:transmission_typeCVT                        NA    
## brandLexus:transmission_typeCVT                  0.33732    
## brandLincoln:transmission_typeCVT                     NA    
## brandMaserati:transmission_typeCVT                    NA    
## brandMazda:transmission_typeCVT                       NA    
## brandMercedes-Benz:transmission_typeCVT               NA    
## brandMINI:transmission_typeCVT                        NA    
## brandMitsubishi:transmission_typeCVT             0.66434    
## brandNissan:transmission_typeCVT                 0.74125    
## brandOther:transmission_typeCVT                       NA    
## brandPontiac:transmission_typeCVT                     NA    
## brandPorsche:transmission_typeCVT                     NA    
## brandRAM:transmission_typeCVT                         NA    
## brandSubaru:transmission_typeCVT                 0.58610    
## brandToyota:transmission_typeCVT                      NA    
## brandVolkswagen:transmission_typeCVT                  NA    
## brandVolvo:transmission_typeCVT                       NA    
## brandAlfa:transmission_typeM/T                        NA    
## brandAudi:transmission_typeM/T                   0.55364    
## brandBMW:transmission_typeM/T                    0.65280    
## brandBuick:transmission_typeM/T                       NA    
## brandCadillac:transmission_typeM/T               0.43915    
## brandChevrolet:transmission_typeM/T              0.01896 *  
## brandChrysler:transmission_typeM/T                    NA    
## brandDodge:transmission_typeM/T                  0.13510    
## brandFord:transmission_typeM/T                   0.08039 .  
## brandGenesis:transmission_typeM/T                     NA    
## brandGMC:transmission_typeM/T                         NA    
## brandHonda:transmission_typeM/T                  0.10369    
## brandHummer:transmission_typeM/T                      NA    
## brandHyundai:transmission_typeM/T                0.02400 *  
## brandINFINITI:transmission_typeM/T               0.80222    
## brandJaguar:transmission_typeM/T                      NA    
## brandJeep:transmission_typeM/T                   0.73805    
## brandKia:transmission_typeM/T                         NA    
## brandLand:transmission_typeM/T                        NA    
## brandLexus:transmission_typeM/T                  0.63512    
## brandLincoln:transmission_typeM/T                     NA    
## brandMaserati:transmission_typeM/T                    NA    
## brandMazda:transmission_typeM/T                  0.96616    
## brandMercedes-Benz:transmission_typeM/T          0.05267 .  
## brandMINI:transmission_typeM/T                   0.57541    
## brandMitsubishi:transmission_typeM/T             0.08135 .  
## brandNissan:transmission_typeM/T                 0.61182    
## brandOther:transmission_typeM/T                  0.08068 .  
## brandPontiac:transmission_typeM/T                0.73818    
## brandPorsche:transmission_typeM/T                0.94018    
## brandRAM:transmission_typeM/T                         NA    
## brandSubaru:transmission_typeM/T                 0.66498    
## brandToyota:transmission_typeM/T                 0.52454    
## brandVolkswagen:transmission_typeM/T             0.08583 .  
## brandVolvo:transmission_typeM/T                       NA    
## model_year:mileage                              1.88e-10 ***
## horsepower:transmission_typeA/T and M/T          0.03014 *  
## horsepower:transmission_typeCVT                  0.24953    
## horsepower:transmission_typeM/T                  0.59209    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3265 on 1696 degrees of freedom
## Multiple R-squared:  0.8497, Adjusted R-squared:  0.8398 
## F-statistic: 85.61 on 112 and 1696 DF,  p-value: < 2.2e-16
# Compare to previous model: AIC of the backward-selected model vs. the
# interaction model (lower AIC is better; df is the number of fitted parameters)
AIC(backward_model, interaction_model)
##                    df      AIC
## backward_model     48 1284.065
## interaction_model 114 1194.811
# Plot diagnostics: lay the regression diagnostic plots out in a 2 x 2 grid
par(mfrow = c(2, 2))
plot(interaction_model)
## Warning: not plotting observations with leverage one:
##   172, 189, 283, 382, 383, 699, 891, 1202, 1558, 1564, 1689, 1762
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

Adding interaction terms between brand and transmission_type, as well as between model_year and mileage, and between horsepower and transmission_type, significantly improved model fit. The AIC decreased from 1284.1 to 1194.8, and Adjusted R² increased from 0.8257 to 0.8398, indicating a better balance of complexity and explanatory power.

Taking a look at generalizability:

# Load packages
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
## 
##     melanoma
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(Metrics)
## 
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
## 
##     precision, recall
# Model specification: eight main effects plus three interaction terms.
# Term order is kept exactly as-is because the fit is rank-deficient and the
# order decides which aliased coefficients get dropped.
interaction_formula <- price ~
  brand + model_year + mileage + fuel_type +
  accident + cylinders + horsepower + transmission_type +
  brand:transmission_type +
  model_year:mileage +
  horsepower:transmission_type

# Fix the RNG so the fold assignment made inside train() is reproducible
set.seed(1361)

# Resampling scheme: 10-fold cross-validation
cv_control <- trainControl(number = 10, method = "cv")

# Refit the OLS model on each set of nine folds and score it on the held-out
# fold; RMSE is the headline metric
cv_interaction_model <- train(
  interaction_formula,
  method = "lm",
  metric = "RMSE",
  trControl = cv_control,
  data = train
)
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
# Print cross-validation performance (resampled RMSE, Rsquared and MAE of the lm fit)
print(cv_interaction_model)
## Linear Regression 
## 
## 1809 samples
##    8 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1628, 1628, 1629, 1629, 1628, 1629, ... 
## Resampling results:
## 
##   RMSE       Rsquared   MAE      
##   0.3429384  0.8235113  0.2461155
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

This is a strong baseline. The model generalizes well, but some overfitting is still possible given the rank-deficiency warnings and the large number of interaction terms. Since I have a lot of predictors, let’s try lasso:

# Load necessary libraries
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-8
library(caret)

# Prepare the data: glmnet needs a numeric design matrix, so expand factors and
# interactions with model.matrix() and drop the intercept column ([, -1])
# because glmnet fits its own intercept.
x <- model.matrix(price ~ brand + model_year + mileage + fuel_type + accident +
                    cylinders + horsepower + transmission_type +
                    brand:transmission_type + model_year:mileage +
                    horsepower:transmission_type, data = train)[, -1]
y <- train$price

# 10-fold cross-validation over glmnet's lambda path
set.seed(1361)
cv_lasso <- cv.glmnet(x, y, alpha = 1, nfolds = 10)  # alpha = 1 for Lasso

# Best lambda value (the one with the smallest mean cross-validated error)
best_lambda <- cv_lasso$lambda.min
cat("Best lambda:", best_lambda, "\n")
## Best lambda: 0.003191795
# Fit final model using best lambda
final_lasso <- glmnet(x, y, alpha = 1, lambda = best_lambda)

# Cross-validated RMSE and R²: cv_lasso$cvm holds the mean held-out MSE for each
# lambda, and lambda.min is by definition the lambda at its minimum. These are
# the figures that are comparable with the caret CV results of the other models.
cv_mse_lasso <- min(cv_lasso$cvm)
cv_rmse_lasso <- sqrt(cv_mse_lasso)
cv_r2_lasso <- 1 - cv_mse_lasso / mean((y - mean(y))^2)
cat("Lasso Model (10-fold CV) — RMSE:", round(cv_rmse_lasso, 4),
    " R²:", round(cv_r2_lasso, 4), "\n")

# In-sample (training-set) RMSE and R²: the model is scored on the same rows it
# was fit to, so these are optimistic and are NOT cross-validated estimates.
preds <- predict(final_lasso, s = best_lambda, newx = x)
rmse_lasso <- sqrt(mean((y - preds)^2))
r2_lasso <- 1 - sum((y - preds)^2) / sum((y - mean(y))^2)
cat("Lasso Model — RMSE:", round(rmse_lasso, 4), " R²:", round(r2_lasso, 4), "\n")
## Lasso Model — RMSE: 0.3227  R²: 0.8433

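Slight improvement. Lasso likely removed some non-informative interaction terms, reducing overfitting and improving predictive accuracy. This suggests the data benefits from regularization and variable selection, something OLS with interactions alone couldn’t optimally handle. One caveat: the Lasso figures printed above (RMSE 0.3227, R² 0.8433) are computed on the same training rows the model was fit to, whereas the OLS figures come from 10-fold cross-validation, so the two are not a strictly like-for-like comparison.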

Now let’s try random forests:

# Load necessary packages
library(caret)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Set seed so the fold assignment and the forests are reproducible
set.seed(1361)

# Define cross-validation settings: 10-fold CV
cv_control <- trainControl(method = "cv", number = 10)

# Fit Random Forest using all predictors (no manual interactions needed).
# `id` is removed from the formula: it is the row identifier from train.csv
# (its only other use is as the key column of the submission file), so letting
# the forest split on it adds noise and cannot transfer to the test set.
# importance = TRUE asks randomForest to compute variable importance so it can
# be plotted from rf_model$finalModel afterwards.
rf_model <- train(
  price ~ . - id,
  data = train,
  method = "rf",
  trControl = cv_control,
  metric = "RMSE",
  importance = TRUE
)

# Print results: resampled RMSE / Rsquared / MAE for each candidate mtry and the
# mtry selected by smallest RMSE
print(rf_model)
## Random Forest 
## 
## 1809 samples
##   12 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1628, 1628, 1629, 1629, 1628, 1629, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared   MAE      
##    2    0.5142615  0.7917171  0.3867407
##   34    0.3171443  0.8498573  0.2251541
##   67    0.3227040  0.8436322  0.2296027
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 34.
# Plot variable importance (optional) for the final forest stored by caret in
# rf_model$finalModel; importances are available because the model was trained
# with importance = TRUE
varImpPlot(rf_model$finalModel)

Slightly better than lasso. The Random Forest model achieved the best performance with an RMSE of 0.3171 and R² of 0.8499, outperforming all previous models, including Lasso. This suggests it generalizes best and should possibly be used for final price predictions. It captures the slightly curved underlying relationships better, but for explaining to a CEO in simple terms how to help the business, the Lasso model might still be preferable because it makes the effect of individual variables easier to understand. The Lasso also performs only very slightly worse.

Let’s try one more middle-ground solution, GAMs, where we can get more predictive accuracy while keeping the generalizability we want. I’m going to compare GAMs on both the backward-selected interaction model and the original full model. I will only apply smooth transformations to features that, intuitively and graphically, need more flexibility to be captured well.

# Load libraries
library(mgcv)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
library(splines)
library(dplyr)
library(cvTools)
## Loading required package: robustbase
## 
## Attaching package: 'robustbase'
## The following object is masked from 'package:boot':
## 
##     salinity
## 
## Attaching package: 'cvTools'
## The following object is masked from 'package:Metrics':
## 
##     mape
# Work on a copy of the training data in which the four categorical columns
# have been rewritten as syntactically valid R names (make.names replaces
# spaces, slashes, dashes, etc. with dots)
train_fixed <- train
categorical_cols <- c("brand", "fuel_type", "accident", "transmission_type")
train_fixed[categorical_cols] <- lapply(train_fixed[categorical_cols], make.names)

# Score a GAM specification by K-fold cross-validation.
#   gam_formula: model formula handed to gam(); the response must be `price`
#   data:        data frame containing `price` and every predictor in the formula
#   folds:       fold assignment as returned by cvFolds()
# Returns a numeric vector with one held-out RMSE per fold (length folds$K).
cv_gam_rmse <- function(gam_formula, data, folds) {
  vapply(seq_len(folds$K), function(k) {
    held_out <- folds$which == k
    train_fold <- data[folds$subsets[!held_out], ]
    valid_fold <- data[folds$subsets[held_out], ]

    fold_model <- gam(gam_formula, data = train_fold, method = "REML")
    fold_preds <- predict(fold_model, newdata = valid_fold)

    sqrt(mean((valid_fold$price - fold_preds)^2))
  }, numeric(1))
}

# Full GAM: categorical and cylinder effects enter linearly, the four continuous
# predictors get low-rank (k = 4) penalized smooths
full_gam_formula <- price ~ brand + fuel_type + accident + cylinders + transmission_type +
  s(model_year, k = 4) + s(mileage, k = 4) + s(horsepower, k = 4) + s(liters, k = 4)

# Fit full GAM with splines on continuous vars
gam_full <- gam(full_gam_formula, data = train_fixed, method = "REML")

# 10-Fold CV on Full GAM

set.seed(1361)
folds <- cvFolds(nrow(train_fixed), K = 10)
rmse_full <- cv_gam_rmse(full_gam_formula, train_fixed, folds)

cat("10-Fold CV RMSE — Full GAM Model:", round(mean(rmse_full), 4), "\n")
## 10-Fold CV RMSE — Full GAM Model: 0.3234
# GAM - BACKWARD MODEL
# NOTE: this formula holds only the backward-selected main effects, entered
# linearly; it has no interaction terms and no smooths, so it is a plain
# linear model fitted through gam().
backward_gam_formula <- price ~ brand + model_year + mileage + fuel_type + accident +
  cylinders + horsepower + transmission_type

gam_backward <- gam(backward_gam_formula, data = train_fixed, method = "REML")

# 10-Fold CV on Backward GAM
# The folds created above are reused so both models are scored on identical
# splits; the seed reset draws no new folds and only keeps later RNG state as before.

set.seed(1361)
rmse_back <- cv_gam_rmse(backward_gam_formula, train_fixed, folds)

cat("10-Fold CV RMSE — Backward GAM Model:", round(mean(rmse_back), 4), "\n")
## 10-Fold CV RMSE — Backward GAM Model: 0.3448

Based on this, I would ultimately choose the Full GAM model because it offers the best trade-off between interpretability and strong predictive performance. Although the Random Forest had the lowest RMSE, it’s a black-box model that doesn’t provide clear explanations for the CEO. The Backward Interaction Lasso was slightly better than the Full GAM in terms of RMSE, but it’s harder to interpret due to its large number of interactions. The Full GAM performs nearly as well, while clearly showing how variables like mileage, horsepower, and accident history affect price — making it the most useful model for actionable business insights. I feel good about this because, from an intuitive standpoint, all of the variables in the dataset can influence consumers’ car-buying decisions.

Let’s finalize our predictions for our Full GAM model now:

# Load libraries
library(mgcv)
library(splines)
library(tidyverse)

# Clean problematic factor levels in both train and test sets. The same
# make.names() transformation is applied to both so that test labels map onto
# the labels the model was trained on.
train_fixed <- train
test_fixed <- test

categorical_cols <- c("brand", "fuel_type", "accident", "transmission_type")
train_fixed[categorical_cols] <- lapply(train_fixed[categorical_cols], make.names)
test_fixed[categorical_cols] <- lapply(test_fixed[categorical_cols], make.names)

# Fit the Full GAM model to the full training data.
# This is the exact specification that was cross-validated as gam_full
# (k = 4 penalized smooths, REML), so the reported CV RMSE describes the model
# that actually produces the submission.
final_gam <- gam(
  price ~ brand + fuel_type + accident + cylinders + transmission_type +
    s(model_year, k = 4) + s(mileage, k = 4) + s(horsepower, k = 4) + s(liters, k = 4),
  data = train_fixed,
  method = "REML"
)

# Predict log prices on the test set
gam_preds <- predict(final_gam, newdata = test_fixed)

# Surface rows that could not be scored instead of silently writing NA prices
if (anyNA(gam_preds)) {
  warning(
    sum(is.na(gam_preds)),
    " test rows have no prediction (missing predictors or factor levels ",
    "not seen in training)",
    call. = FALSE
  )
}

# Create dataframe for submission (ids come straight from the raw test file)
final_predictions <- data.frame(
  id = test$id,
  price = as.vector(gam_preds)  # log(price)
)

# Save predictions to CSV
write.csv(final_predictions, "testing_predictions_Friedman_Graham_GDF19.csv", row.names = FALSE)

Let’s also make sure that influential points don’t play an outsized role. I am inclined to leave them in, assume that they are not just data-entry errors, and trust the business data, so let’s at least inspect what they imply. I used an approximation here because influence diagnostics are much more thoroughly developed for linear models than for GAMs.

# === Load Required Package ===
library(mgcv)

# === Step 1: Compute Linear Predictor Matrix ===
X <- predict(final_gam, type = "lpmatrix")  # Design matrix for the GAM

# === Step 2: Compute Hat Matrix Diagonal ===
# Leverage is the diagonal of H = X (X'X)^-1 X'. Only the diagonal is needed,
# so it is computed row by row as h_ii = sum_j [X (X'X)^-1]_ij * X_ij instead
# of materialising the full n x n hat matrix. crossprod(X) is t(X) %*% X.
XtX_inv <- solve(crossprod(X))

# === Step 3: Extract Leverage Values ===
# rowSums() keeps the row names of X, so each value stays labelled with its
# observation index.
leverage_vals <- rowSums((X %*% XtX_inv) * X)

# === Step 4: Plot Leverage Values ===
plot(leverage_vals, type = "h",
     main = "Approximate Leverage Values from GAM",
     ylab = "Leverage", xlab = "Observation Index")
# Reference line at twice the average leverage
abline(h = 2 * mean(leverage_vals), col = "red", lty = 2)

# === Step 5: Identify Top 5 Highest Leverage Points ===
# head() returns fewer than 5 indices (rather than NA padding) if the data
# ever has fewer than 5 rows.
top_leverage_idx <- head(order(leverage_vals, decreasing = TRUE), 5)
top_leverage_vals <- leverage_vals[top_leverage_idx]
top_leverage_obs <- train_fixed[top_leverage_idx, ]  # or `train` if not using cleaned version

# View top leverage values and rows
top_leverage_idx
## [1] 1200  130  371  227   31
top_leverage_vals
##      1200       130       371       227        31 
## 0.2986871 0.2083296 0.2029099 0.2018064 0.2015757
top_leverage_obs
## # A tibble: 5 × 13
##      id price brand   model_year mileage fuel_type ext_col int_col accident     
##   <dbl> <dbl> <chr>        <dbl>   <dbl> <chr>     <chr>   <chr>   <chr>        
## 1  1889  8.70 Honda         2013  405000 Gasoline  Black   Black   None.reported
## 2   209 10.2  Genesis       2017   63500 Gasoline  Silver  Black   None.reported
## 3   584 10.4  Genesis       2020   23600 Gasoline  Blue    Gray    None.reported
## 4   349 10.6  Genesis       2022   12000 Gasoline  Green   Beige   None.reported
## 5    49 10.9  Genesis       2021    9950 Gasoline  Silver  Blue    None.reported
## # ℹ 4 more variables: cylinders <dbl>, horsepower <dbl>, liters <dbl>,
## #   transmission_type <chr>

Extreme Mileage Outlier (Honda) Observation 1200 is a 2013 Honda with 405,000 miles, far beyond all other cars in the dataset. Its log price is very low (≈ 8.7), consistent with such high mileage. This alignment suggests the point is accurate, not a data error, and reflects real market depreciation. While it heavily influences the mileage spline, it provides meaningful information and was kept in the model.

Genesis Vehicles Anchoring the Price Curve Four high-leverage points are low-mileage, newer Genesis cars from 2017–2022. They have high log prices (10.2 to 10.9) and likely define the upper trend in the model. These are not problematic, but it’s important to recognize that they shape the spline fit at the high end.

All high-leverage points reflect genuine data patterns, not anomalies, so I won’t omit them from the model. Their inclusion supports model flexibility and generalizability, rather than undermining it.

What does the final model tell us that simple charts and other models wouldn’t be able to?

# Load libraries
library(mgcv)
library(ggplot2)
library(dplyr)

# Refit the final GAM with penalised smooths (s(), basis dimension k = 3)
# in place of the spline-basis terms, so the continuous effects can be
# plotted and summarised directly. Note this overwrites `final_gam`.
smooth_formula <- price ~ brand + fuel_type + accident + cylinders +
  transmission_type +
  s(model_year, k = 3) + s(mileage, k = 3) + s(horsepower, k = 3) +
  s(liters, k = 3)

final_gam <- gam(smooth_formula, data = train_fixed)

# Non-linear effects with shaded standard-error bands, all on one page
plot(
  final_gam,
  pages = 1,
  se = TRUE,
  shade = TRUE,
  main = "Smoothed Effects of Continuous Predictors"
)

# Parametric coefficients plus approximate significance of each smooth
summary(final_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## price ~ brand + fuel_type + accident + cylinders + transmission_type + 
##     s(model_year, k = 3) + s(mileage, k = 3) + s(horsepower, 
##     k = 3) + s(liters, k = 3)
## 
## Parametric coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  10.6022369  0.1154804  91.810  < 2e-16 ***
## brandAlfa                    -0.1118431  0.1462355  -0.765 0.444484    
## brandAudi                     0.1292032  0.0713558   1.811 0.070360 .  
## brandBMW                      0.0792310  0.0675701   1.173 0.241125    
## brandBuick                   -0.2744997  0.1296765  -2.117 0.034417 *  
## brandCadillac                 0.0227350  0.0780947   0.291 0.770994    
## brandChevrolet                0.0174024  0.0699240   0.249 0.803486    
## brandChrysler                -0.3937634  0.1067305  -3.689 0.000232 ***
## brandDodge                   -0.0745066  0.0802288  -0.929 0.353184    
## brandFord                    -0.0006973  0.0674420  -0.010 0.991751    
## brandGenesis                 -0.2360233  0.1573228  -1.500 0.133730    
## brandGMC                      0.0644659  0.0801951   0.804 0.421584    
## brandHonda                    0.2607171  0.0890388   2.928 0.003454 ** 
## brandHummer                   0.5864613  0.1392418   4.212 2.66e-05 ***
## brandHyundai                 -0.2410893  0.0826980  -2.915 0.003598 ** 
## brandINFINITI                -0.1532701  0.0829894  -1.847 0.064935 .  
## brandJaguar                  -0.0851007  0.0890272  -0.956 0.339257    
## brandJeep                     0.2461383  0.0753154   3.268 0.001104 ** 
## brandKia                     -0.0634179  0.0904917  -0.701 0.483511    
## brandLand                     0.2086562  0.0750826   2.779 0.005510 ** 
## brandLexus                    0.3032353  0.0714816   4.242 2.33e-05 ***
## brandLincoln                 -0.0833105  0.0880177  -0.947 0.344013    
## brandMaserati                 0.2263110  0.1079149   2.097 0.036125 *  
## brandMazda                    0.1431178  0.1026841   1.394 0.163564    
## brandMercedes.Benz            0.1969950  0.0680130   2.896 0.003821 ** 
## brandMINI                    -0.0410694  0.1049093  -0.391 0.695493    
## brandMitsubishi               0.1655032  0.1121254   1.476 0.140109    
## brandNissan                   0.0103168  0.0756329   0.136 0.891516    
## brandOther                    0.4942810  0.0854556   5.784 8.61e-09 ***
## brandPontiac                  0.0356053  0.1247446   0.285 0.775352    
## brandPorsche                  0.6161069  0.0724547   8.503  < 2e-16 ***
## brandRAM                     -0.0006839  0.0936773  -0.007 0.994176    
## brandSubaru                   0.1889412  0.0834577   2.264 0.023701 *  
## brandToyota                   0.2616763  0.0706386   3.704 0.000218 ***
## brandVolkswagen               0.0209950  0.0891241   0.236 0.813794    
## brandVolvo                   -0.0311678  0.1089188  -0.286 0.774792    
## fuel_typeE85.Flex.Fuel       -0.5005477  0.0618270  -8.096 1.05e-15 ***
## fuel_typeGasoline            -0.6034630  0.0514329 -11.733  < 2e-16 ***
## fuel_typeHybrid              -0.6091802  0.0617335  -9.868  < 2e-16 ***
## accidentNone.reported         0.0417343  0.0174560   2.391 0.016915 *  
## cylinders                    -0.0067309  0.0156857  -0.429 0.667894    
## transmission_typeA.T.and.M.T  0.0462754  0.0236700   1.955 0.050739 .  
## transmission_typeCVT         -0.0044708  0.0541444  -0.083 0.934201    
## transmission_typeM.T          0.0961925  0.0281394   3.418 0.000644 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                 edf Ref.df       F p-value    
## s(model_year) 1.984  2.000 254.379  <2e-16 ***
## s(mileage)    1.952  1.998 299.492  <2e-16 ***
## s(horsepower) 1.967  1.999 206.548  <2e-16 ***
## s(liters)     1.000  1.000   0.049   0.825    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.845   Deviance explained = 84.9%
## GCV = 0.10608  Scale est. = 0.1031    n = 1809
# Fitted vs. Actual Log Prices
# In-sample fitted values are stored on the training table so both
# diagnostic plots below can share the same x variable.
train_fixed$predicted <- predict(final_gam)

# Points on the dashed 45-degree line are predicted exactly
train_fixed %>%
  ggplot(aes(x = predicted, y = price)) +
  geom_point(color = "darkgreen", alpha = 0.3) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "blue") +
  theme_minimal() +
  labs(
    x = "Predicted Log Price",
    y = "Actual Log Price",
    title = "Fitted vs Actual Log Price"
  )

# Residuals vs. Fitted Values
# Residuals are taken straight from the model; they are in the same row
# order as `train_fixed`, which the model was fitted on.
train_fixed %>%
  ggplot(aes(x = predicted, y = residuals(final_gam))) +
  geom_point(color = "darkred", alpha = 0.3) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
  theme_minimal() +
  labs(
    x = "Fitted Values",
    y = "Residuals",
    title = "Residuals vs Fitted Values"
  )

What This Model Shows That Other Models or Simple Plots Don’t Non-linear trends are captured and visualized. GAM uncovers curves for mileage, model year, and horsepower — which simple models would have to oversimplify. These smooth functions help us understand inflection points.

We get uncertainty and statistical significance. Each brand, fuel type, and accident status has a confidence interval and p-value, showing whether the effect is real or noise.

We validate assumptions and check generalization. The residuals vs. fitted plot is clean and centered, and fitted vs. actual price shows tight alignment. This suggests the model fits well without overfitting — crucial for making reliable business decisions.

Key Insights Discovered from the Model Model Year Effect: More recent cars significantly increase price, especially after 2015. Insight: There’s accelerating value in newer cars — particularly post-2015.

Mileage Effect: Steep decline early on, then it flattens. Insight: Price drops sharply in the first ~50,000 miles, then loses value more slowly.

Horsepower Effect: Increasing returns up to around 600 hp. Insight: Higher horsepower does increase value — but gains taper off.

Engine Size (Liters) Effect: No meaningful effect. Insight: Buyers don’t seem to value bigger engines once horsepower is accounted for.

Brand Effects Effect: Porsche, Lexus, and Land Rover command large premiums. Insight: Brand identity remains a powerful driver of price — some brands consistently outperform others even after accounting for features.

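Fuel Type Effect: Gasoline, hybrid, and E85 flex-fuel cars are all penalized (roughly 0.50–0.61 log points) relative to the reference fuel type. The reference is the level omitted from the coefficient table — diesel, not electric; the fitted model has no electric level. Insight: Holding the other predictors fixed, gasoline, hybrid, and flex-fuel cars have significantly lower prices than comparable diesel cars — possibly due to operating costs or market trends.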

Accidents Effect: Clean history raises price ~0.04 log points. Insight: Even small accident history differences materially affect price.

Recommendations to the CEO Prioritize Acquiring Newer Cars Focus inventory on vehicles made after 2015 — they command higher prices and the GAM model shows a non-linear value boost in recent years.

Deprioritize High-Mileage Vehicles Avoid sourcing cars over 50,000 miles unless priced steeply lower. The value curve flattens — older cars lose price appeal fast.

Market High-Horsepower Models Strategically Push performance-focused marketing on models with ~300–600 hp. Gains beyond that taper, so there’s a sweet spot in perception.

Don’t Overpay for Bigger Engines Engine size (liters) does not independently increase price. Avoid using this as a pricing justification.

Price Brands Strategically Use the brand coefficients as a guide. Brands like Porsche and Lexus command strong price premiums — this should reflect in sourcing and pricing strategy.

Emphasize Clean Vehicle Histories Cars with no reported accidents command reliably higher prices. Make this a key feature in listings and sourcing criteria.

Watch Fuel Trends There is a consistent penalty for gasoline, flex fuel, and hybrid cars relative to the model’s reference fuel type (diesel). Explore whether this trend is shifting in your markets and consider adjusting inventory mix accordingly.

The relationships above are rather intuitive.

Now we are going to try to find and justify more important relationships to help the CEO by exploring the data directly, rather than just pointing out obvious relationships such as mileage reducing price. Along the way, we can confirm earlier findings and group similar ideas together. Note: I am going to write a lot in this Rmd file, and it will be similar to what I say in my reports, so that I can organize my thoughts as I go along rather than submitting the code and then having to look back through everything separately when writing my report.

# Log price distribution by exterior color (one box per color)
boxplot(
  price ~ ext_col,
  data = train,
  main = "Exterior Color vs Log Price",
  xlab = "Exterior Color",
  ylab = "Log Price",
  col = "lightblue",
  las = 2  # axis labels perpendicular to the axis so color names fit
)

# Log price distribution by interior color (one box per color)
boxplot(
  price ~ int_col,
  data = train,
  main = "Interior Color vs Log Price",
  xlab = "Interior Color",
  ylab = "Log Price",
  col = "lightpink",
  las = 2  # axis labels perpendicular to the axis so color names fit
)

Interior Color: Blue and Other interior colors show the highest median log prices. Gray and Beige interiors tend to be associated with lower prices. While Black is a common color, it doesn’t command a premium compared to more unique colors like Blue.

Business Tip: Offer or target vehicles with more unique interior colors (like Blue or even Other) when sourcing inventory. These seem to fetch higher prices on average.

Exterior Color: Orange and Green cars have noticeably higher median prices. Yellow, Brown, and Gold exterior colors seem to be associated with lower price points. Gray, White, and Silver are neutral but fall somewhere in the middle — possibly due to supply volume.

Business Tip: While unconventional, rare exterior colors like Orange or Green could be positioned as premium or limited options. Avoid overstocking Yellow or Gold vehicles unless offered at a steep discount.

# Log price for every brand x transmission combination; the `:` in the
# formula creates one box per pairing of the two grouping variables
boxplot(
  price ~ brand:transmission_type,
  data = train,
  main = "Brand & Transmission vs Log Price",
  xlab = "Brand:Transmission",
  ylab = "Log Price",
  col = "lightgreen",
  las = 2  # axis labels perpendicular to the axis so group names fit
)

Key Takeaways from Brand & Transmission: Luxury Brands + Automatic Transmission (AT): Brands like Mercedes-Benz, Lexus, and Jaguar with AT generally show higher median prices. This aligns with expectations but reinforces that AT is standard in higher-end vehicles.

Manual Transmission (MT) or CVT: Across most brands, MT or CVT options tend to fetch lower prices. This might indicate reduced demand or lower perceived value, especially for mass-market brands.

Hybrid Transmissions (A/T and M/T): This mixed category shows high variance but in some brands (like “Other”), can still yield strong prices. Could reflect niche vehicles or specialty trims.

Brand Matters Most: Even with the same transmission type, there’s a noticeable price gap across brands. For example, a Buick with AT is worth less than a Lexus with AT, highlighting brand equity.

Business Suggestion: Focus on acquiring luxury-brand vehicles with automatic transmissions, as they consistently yield higher resale value. Avoid overstocking manual or CVT variants unless there’s specific demand.

# Log price distribution for each fuel type
boxplot(
  price ~ fuel_type,
  data = train,
  main = "Fuel Type vs Log Price",
  ylab = "Log Price",
  col = "orange"
)

This Fuel Type vs Log Price boxplot gives you another potential business insight to bring to the CEO:

Fuel Type and Car Pricing Insights:

Diesel and Hybrid Vehicles: Tend to have higher median prices than other fuel types. Suggests these vehicles retain value better or are generally positioned as premium offerings. This could reflect better fuel economy or their presence in luxury brands.

E85 Flex Fuel Vehicles: Have the lowest median prices, with a narrow interquartile range. These vehicles might be harder to resell or less desirable in this local market. Potential red flag for overstocking.

Gasoline Vehicles: Very wide spread in prices. Makes sense since this category includes everything from budget to luxury.

Notable that while some gas cars reach high log prices, the overall median is lower than Diesel and Hybrid.

Business Suggestion: Encourage the dealership to prioritize Diesel and Hybrid vehicles in their inventory when possible, especially if margins allow. Be cautious with E85 Flex Fuel vehicles, which may depreciate faster or be harder to sell in Pittsburgh.

# Log price distribution split by reported accident history
boxplot(
  price ~ accident,
  data = train,
  main = "Accident History vs Log Price",
  ylab = "Log Price",
  col = "lightgray"
)

This is less surprising, and largely intuitive:

Cars with No Reported Accidents: Show a higher median log price than those with accident history. The range of prices is also slightly broader on the upper end.

Cars with At Least One Accident: Have a lower median price and more compressed price spread. Even a single incident seems to suppress value noticeably.

Business Insight: Accident history is a clear price depressor. The CEO should: Use this as a negotiation tool when sourcing used cars. Be transparent but strategic when selling accident-affected vehicles. Consider highlighting “accident-free” status in marketing — it clearly boosts price perception.

More relationships:

# Engine Size vs. Log Price
# Scatter of log price against displacement with a LOESS trend line
library(ggplot2)
train %>%
  ggplot(aes(x = liters, y = price)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "loess") +
  labs(
    x = "Engine Size (liters)",
    y = "Log Price",
    title = "Log Price vs. Engine Size (Liters)"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Interaction: Cylinders and Horsepower
# One straight-line fit per cylinder count; differing slopes across the
# lines would point to a horsepower x cylinders interaction
train %>%
  ggplot(aes(x = horsepower, y = price, color = factor(cylinders))) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "Horsepower",
    y = "Log Price",
    color = "Cylinders",
    title = "Horsepower vs. Log Price by Cylinder Count"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Interaction: Interior + Exterior Color Combinations
library(dplyr)

# The 10 most frequent exterior/interior pairings. slice_max() replaces the
# superseded top_n(); like top_n() it keeps ties by default, so more than
# 10 rows can be returned if counts tie at the cutoff.
top_color_combos <- train %>%
  count(ext_col, int_col, sort = TRUE) %>%
  slice_max(order_by = n, n = 10) %>%
  select(ext_col, int_col)

# Keep only cars whose color pairing is among the most frequent ones, then
# compare their log price distributions (flipped so the labels are readable)
train %>%
  semi_join(top_color_combos, by = c("ext_col", "int_col")) %>%
  ggplot(aes(x = interaction(ext_col, int_col), y = price)) +
  geom_boxplot() +
  coord_flip() +
  labs(title = "Top 10 Exterior + Interior Color Combos vs. Log Price",
       x = "Color Combo (Exterior + Interior)",
       y = "Log Price")

# Interaction: Mileage vs. Price by Brand
# The 6 most common brands. slice_max() replaces the superseded top_n();
# ties at the cutoff are kept, exactly as top_n() did.
top_brands <- train %>%
  count(brand, sort = TRUE) %>%
  slice_max(order_by = n, n = 6) %>%
  pull(brand)

# One LOESS curve per brand to compare how price falls with mileage
train %>%
  filter(brand %in% top_brands) %>%
  ggplot(aes(x = mileage, y = price, color = brand)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "loess", se = FALSE) +
  labs(title = "Mileage vs. Log Price by Brand",
       x = "Mileage",
       y = "Log Price")
## `geom_smooth()` using formula = 'y ~ x'

# Model Year vs. Price by Transmission Type
# Points are jittered because model_year takes whole-year values and would
# otherwise overplot; one LOESS curve is drawn per transmission type
train %>%
  ggplot(aes(x = model_year, y = price, color = transmission_type)) +
  geom_jitter(alpha = 0.3) +
  geom_smooth(method = "loess") +
  labs(
    x = "Model Year",
    y = "Log Price",
    color = "Transmission",
    title = "Model Year vs. Log Price by Transmission"
  )
## `geom_smooth()` using formula = 'y ~ x'

Transmission Type: Newer cars are generally worth more. However, cars with manual transmissions or both manual and automatic options tend to retain value slightly better than cars with CVT transmissions. CVT cars show flatter price growth, suggesting they depreciate faster.

Brand and Mileage: Cars lose value as mileage increases. But luxury brands like Lexus and Mercedes-Benz hold their value better than economy brands like Ford or Chevrolet, even at high mileage. This suggests brand reliability influences resale price more than mileage alone.

Color Combinations: Neutral color combinations, like white exteriors with gray or black interiors, are associated with higher prices. In contrast, unusual or less common color pairings, like black with beige, show more variability in price and may reduce buyer interest.

Horsepower and Cylinders: Higher horsepower leads to higher prices, but the number of cylinders doesn’t always add value. Some 4-cylinder cars with high horsepower can match or exceed the value of 6- or 8-cylinder vehicles, showing that buyers care about performance and efficiency more than engine size alone.

Engine Size (Liters): Engine size has a nonlinear relationship with price. Price increases up to about 4 liters, then flattens out. Larger engines don’t always lead to higher resale value, possibly due to concerns about fuel economy or maintenance costs.

CEO suggestions:

Prioritize Manual and Mixed Transmission Cars When Possible Manual or dual transmission cars tend to hold their value better than CVT models, especially in newer vehicles. Avoid older CVT cars unless they’re priced low.

Don’t Dismiss High-Mileage Luxury Cars Brands like Lexus and Mercedes-Benz retain their value better than economy brands, even at high mileage. These can be smart buys if they’re well-maintained.

Stick to Clean, Neutral Color Combos Cars with white exteriors and gray or black interiors tend to sell for more. Avoid unusual color combinations unless the price is heavily discounted.

Look Beyond Cylinder Count — Focus on Horsepower Some high-performance 4-cylinder cars are just as valuable as 6- or 8-cylinder ones. Buyers are willing to pay for horsepower, even with smaller engines.

Don’t Overpay for Oversized Engines Larger engines above 4 liters don’t guarantee higher resale prices. Focus on cars with balanced engine size and performance, not just size.

More research:

# Accident History by Brand
library(dplyr)

# The 6 most common brands. slice_max() replaces the superseded top_n();
# ties at the cutoff are kept, exactly as top_n() did.
top_brands <- train %>%
  count(brand, sort = TRUE) %>%
  slice_max(order_by = n, n = 6) %>%
  pull(brand)

# One panel per brand, comparing log price with and without reported
# accidents
train %>%
  filter(brand %in% top_brands) %>%
  ggplot(aes(x = accident, y = price, fill = accident)) +
  geom_boxplot() +
  facet_wrap(~ brand) +
  labs(title = "Impact of Accident History on Log Price by Brand",
       x = "Accident History", y = "Log Price")

# Outliers: Check Extremely High-Priced Cars
# Cars with log price above 13, with the columns needed to judge whether the
# listing looks genuine.
high_priced_cars <- train %>%
  filter(price > 13) %>%
  select(id, brand, model_year, mileage, cylinders, horsepower, liters,
         fuel_type, ext_col, int_col, accident, transmission_type)

# View() needs an interactive data viewer, so it is only called in an
# interactive session; when knitting or running as a script the table is
# printed instead.
if (interactive()) {
  View(high_priced_cars)
} else {
  print(high_priced_cars)
}

# Mileage Threshold Effects by Brand
# One panel per top brand with its own LOESS curve, to look for
# brand-specific mileage points where price starts dropping faster
mileage_top_brands <- train %>%
  filter(brand %in% top_brands)

ggplot(mileage_top_brands, aes(x = mileage, y = price)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "loess") +
  facet_wrap(~ brand) +
  labs(
    x = "Mileage",
    y = "Log Price",
    title = "Mileage vs. Log Price with Smoothers by Brand"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Brand + Color Interactions
# Exterior color boxplots, one panel per top brand; x labels are angled so
# the color names do not overlap
color_top_brands <- train %>%
  filter(brand %in% top_brands)

ggplot(color_top_brands, aes(x = ext_col, y = price, fill = ext_col)) +
  geom_boxplot() +
  facet_wrap(~ brand) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    x = "Exterior Color",
    y = "Log Price",
    title = "Exterior Color vs. Log Price by Brand"
  )

# Super-Low Mileage Cars
# Cars with fewer than 5,000 miles, with the key fields for a manual check.
low_mileage_cars <- train %>%
  filter(mileage < 5000) %>%
  select(id, brand, model_year, mileage, price, cylinders, liters, accident)

# View() needs an interactive data viewer, so it is only called in an
# interactive session; when knitting or running as a script the table is
# printed instead.
if (interactive()) {
  View(low_mileage_cars)
} else {
  print(low_mileage_cars)
}

Inference:

Color Effects Depend on Brand Some brands, like Audi, BMW, and Lexus, show higher prices for neutral colors like gray, white, or silver, while colors like yellow, green, and brown are much more variable and sometimes lower-priced. This means color isn’t just aesthetic — brand and color interact. For example, a black Lexus might do well, but a yellow BMW might not.

CEO Recommendation: Avoid bold colors unless they’re deeply discounted. Stick to safe colors like silver, gray, and white when stocking luxury vehicles.

Mileage-Based Value Drop-offs Differ by Brand The “cliff” effect is real: price drops faster after around 100,000 miles, but some brands soften this drop. Lexus and Mercedes-Benz decline more gradually, while Ford and Toyota show steeper drops.

CEO Recommendation: Set stricter mileage caps when buying Ford or Toyota. You can take more risk with high-mileage Lexus and Mercedes-Benz cars—they retain value longer.

Accident History Impacts Cheaper Brands More For luxury brands like BMW and Mercedes-Benz, accident history has a smaller price penalty. But for Toyota and Ford, the presence of an accident shows a clear drop in log price.

CEO Recommendation: Avoid accident-history vehicles for economy brands. For luxury brands, you can still consider slightly damaged cars if the price is right.

Low-Mileage Vehicles Are a Goldmine Your list of cars with mileage under 5,000 shows they’re mostly 2023 models across multiple brands with high log prices. This includes Ford, BMW, Toyota, Cadillac, and Porsche — all priced near or above log 11.5.

CEO Recommendation: Flag ultra-low mileage vehicles (under 5,000 miles) as high-margin opportunities. They’re nearly new, and buyers are willing to pay close to full value.